#Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Read data from CSV file
nf = pd.read_csv('netflix_titles.csv')
#Original data with first 5 rows
nf.head(5)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81145628 | Movie | Norm of the North: King Sized Adventure | Richard Finn, Tim Maltby | Alan Marriott, Andrew Toth, Brian Dobson, Cole... | United States, India, South Korea, China | September 9, 2019 | 2019 | TV-PG | 90 min | Children & Family Movies, Comedies | Before planning an awesome wedding for his gra... |
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | NaN | Jandino Asporaat | United Kingdom | September 9, 2016 | 2016 | TV-MA | 94 min | Stand-Up Comedy | Jandino Asporaat riffs on the challenges of ra... |
| 2 | 70234439 | TV Show | Transformers Prime | NaN | Peter Cullen, Sumalee Montano, Frank Welker, J... | United States | September 8, 2018 | 2013 | TV-Y7-FV | 1 Season | Kids' TV | With the help of three human allies, the Autob... |
| 3 | 80058654 | TV Show | Transformers: Robots in Disguise | NaN | Will Friedle, Darren Criss, Constance Zimmer, ... | United States | September 8, 2018 | 2016 | TV-Y7 | 1 Season | Kids' TV | When a prison ship crash unleashes hundreds of... |
| 4 | 80125979 | Movie | #realityhigh | Fernando Lebrija | Nesta Cooper, Kate Walsh, John Michael Higgins... | United States | September 8, 2017 | 2017 | TV-14 | 99 min | Comedies | When nerdy high schooler Dani finally attracts... |
#Basic information about the dataset
nf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6234 entries, 0 to 6233 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 6234 non-null int64 1 type 6234 non-null object 2 title 6234 non-null object 3 director 4265 non-null object 4 cast 5664 non-null object 5 country 5758 non-null object 6 date_added 6223 non-null object 7 release_year 6234 non-null int64 8 rating 6224 non-null object 9 duration 6234 non-null object 10 listed_in 6234 non-null object 11 description 6234 non-null object dtypes: int64(2), object(10) memory usage: 584.6+ KB
# Report the dataset dimensions; nf.shape is (rows, columns), unpacked straight into the message
print("There are {} rows and {} columns in the dataset.".format(*nf.shape))
There are 6234 rows and 12 columns in the dataset.
nf['type'].value_counts()
Movie 4265 TV Show 1969 Name: type, dtype: int64
# Derive the year each title was added to Netflix.
# Surrounding whitespace is stripped first so every value reaches the parser in
# the same layout (assumes some raw 'date_added' strings may be padded; the
# strip is a no-op otherwise).
nf['date_added'] = pd.to_datetime(nf['date_added'].str.strip())
nf['year_added'] = nf['date_added'].dt.year
# Copy release_year into 'year_release' as integers; astype() converts the
# whole column at once instead of calling int() on every element.
nf['year_release'] = nf['release_year'].dropna().astype(int)
# Drop 'date_added' and 'release_year' because we now have alternatives
nf.drop(['date_added','release_year'], axis=1, inplace=True)
#Unique values in rating columns
nf['rating'].unique()
array(['TV-PG', 'TV-MA', 'TV-Y7-FV', 'TV-Y7', 'TV-14', 'R', 'TV-Y', 'NR',
'PG-13', 'TV-G', 'PG', 'G', nan, 'UR', 'NC-17'], dtype=object)
#Function to count the number of contents having "minutes" as duration
#1: min
#0: seasons
def count(duration):
    """Flag whether a duration label is expressed in minutes.

    Args:
        duration: Raw ``duration`` value, e.g. ``'90 min'`` or ``'1 Season'``.

    Returns:
        1 when the label contains ``'min'``, otherwise 0. Non-string values
        (such as NaN or None for a missing duration) also yield 0.
    """
    # Guard the membership test so a missing value does not raise TypeError.
    return int(isinstance(duration, str) and 'min' in duration)
dur = nf['duration'].dropna().apply(lambda x: count(x))
dur.value_counts()
1 4265 0 1969 Name: duration, dtype: int64
# Split the text 'duration' column (e.g. '90 min', '2 Seasons') into two integer columns.
# The leading number is taken with split() rather than a fixed-width slice, so the
# parse does not depend on how many digits the number has and yields ints directly.
# 'Season': number of seasons, or 0 when the duration is given in minutes
nf['Season'] = nf['duration'].apply(lambda x: int(x.split()[0]) if 'Season' in x else 0)
# 'duration': running time in minutes, or 0 when the duration is given in seasons
# (0 min --> the 1969 titles that have at least 1 season)
nf['duration'] = nf['duration'].apply(lambda x: int(x.split()[0]) if 'min' in x else 0)
nf['duration'].value_counts()
0 1969
90 111
91 104
92 101
95 94
...
228 1
224 1
200 1
196 1
205 1
Name: duration, Length: 187, dtype: int64
#Change season int
#0 season --> 4265 contents have 1 episode
nf['Season'] = pd.to_numeric(nf['Season'])
nf['Season'].value_counts()
0 4265 1 1321 2 304 3 158 4 61 5 46 6 22 7 21 8 16 9 7 11 3 10 3 15 2 13 2 12 2 14 1 Name: Season, dtype: int64
#Overall missing data from each column
nf.isnull().sum()
show_id 0 type 0 title 0 director 1969 cast 570 country 476 rating 10 duration 0 listed_in 0 description 0 year_added 11 year_release 0 Season 0 dtype: int64
#Missing data in graph
plt.figure(figsize=(10,5))
sns.heatmap(nf.isnull())
plt.show()
nf.isnull().sum()/len(nf)*100
show_id 0.000000 type 0.000000 title 0.000000 director 31.584857 cast 9.143407 country 7.635547 rating 0.160411 duration 0.000000 listed_in 0.000000 description 0.000000 year_added 0.176452 year_release 0.000000 Season 0.000000 dtype: float64
# Drop columns that are not analysed further ('director' alone is ~32% missing)
nf.drop(['director', 'cast', 'description'], axis=1, inplace=True)
# Fill all missing values in the 'country' column with United States,
# as Netflix was created in the USA and every show is aired on Netflix US.
# The result is assigned back instead of calling replace(..., inplace=True) on
# nf['country']: that in-place call acts on an intermediate Series and is not
# guaranteed to update nf itself.
nf['country'] = nf['country'].fillna('United States')
# Drop the remaining rows with missing data; they account for well under 1% of the dataset
nf.dropna(inplace=True)
# Double-check that no missing values remain (percentage per column)
nf.isnull().sum()/len(nf)*100
show_id 0.0 type 0.0 title 0.0 country 0.0 rating 0.0 duration 0.0 listed_in 0.0 year_added 0.0 year_release 0.0 Season 0.0 dtype: float64
#Review data after cleaning process
nf.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81145628 | Movie | Norm of the North: King Sized Adventure | United States, India, South Korea, China | TV-PG | 90 | Children & Family Movies, Comedies | 2019.0 | 2019 | 0 |
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | United Kingdom | TV-MA | 94 | Stand-Up Comedy | 2016.0 | 2016 | 0 |
| 2 | 70234439 | TV Show | Transformers Prime | United States | TV-Y7-FV | 0 | Kids' TV | 2018.0 | 2013 | 1 |
| 3 | 80058654 | TV Show | Transformers: Robots in Disguise | United States | TV-Y7 | 0 | Kids' TV | 2018.0 | 2016 | 1 |
| 4 | 80125979 | Movie | #realityhigh | United States | TV-14 | 99 | Comedies | 2017.0 | 2017 | 0 |
# Share of each content type (Movie vs TV Show).
# The counts are computed once so that wedge sizes and labels come from the
# same Series: previously the sizes were sorted ascending while the labels
# kept value_counts()' descending order, which swapped the two labels.
type_counts = nf['type'].value_counts().sort_values()
plt.figure(figsize=(10,5))
plt.pie(x=type_counts, labels=type_counts.index,
        explode=[0.02,0], autopct = '%1.2f%%')
plt.title('Types of Content', fontsize=12, fontweight='bold')
plt.show()
#Explore country column
nf['country'].value_counts()
United States 2494
India 777
United Kingdom 347
Japan 174
Canada 141
...
United Kingdom, France, Germany 1
Poland, West Germany 1
Spain, Colombia 1
United States, United Kingdom, Denmark, Sweden 1
Thailand, United States 1
Name: country, Length: 554, dtype: int64
from collections import Counter
# A title can list several countries ("United States, India"), so split every
# entry on commas and tally each country on its own.
country_data = nf['country']
# strip() handles any spacing around the commas; empty tokens (e.g. from a
# trailing comma) are skipped so they are not counted as a country.
country = [
    name.strip()
    for entry in country_data
    for name in entry.split(',')
    if name.strip()
]
# Stored as country_counter rather than `count`, so the helper function of
# that name is not overwritten by a Counter object.
country_counter = Counter(country)
country_count = pd.Series(dict(country_counter)).sort_values(ascending=False)
#TOP 10 Countries
top10_country = country_count.head(10)
top10_country
United States 3072 India 838 United Kingdom 601 Canada 318 France 271 Japan 229 Spain 178 South Korea 162 Germany 151 Mexico 129 dtype: int64
# Top-10 countries drawn twice: bar chart on the left (wider panel), pie chart on the right
from matplotlib import gridspec

country_names = top10_country.index
title_totals = top10_country

fig = plt.figure(figsize=(20, 6))
panel_grid = gridspec.GridSpec(nrows=1, ncols=2, height_ratios=[6], width_ratios=[10, 5])

# Left panel: bar chart
bar_ax = plt.subplot(panel_grid[0])
sns.barplot(x=country_names, y=title_totals, ax=bar_ax, palette="RdGy")
bar_ax.set_xticklabels(country_names)
bar_ax.set_title('Top 10 countries', fontsize=15, fontweight='bold')

# Right panel: pie chart of the same totals
pie_ax = plt.subplot(panel_grid[1])
pie_ax.pie(
    title_totals,
    labels=country_names,
    shadow=True,
    colors=sns.color_palette("RdGy", n_colors=20),
    autopct='%1.2f%%',
)
pie_ax.axis('equal')
plt.show()
# Keep only the titles whose 'country' value is exactly one of the top-10 countries.
# isin() against the computed top10_country index replaces ten hand-written
# comparisons. Note that multi-country entries such as
# "United States, India" are not matched by this exact comparison.
top_10 = nf[nf['country'].isin(top10_country.index)]
top_10.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | United Kingdom | TV-MA | 94 | Stand-Up Comedy | 2016.0 | 2016 | 0 |
| 2 | 70234439 | TV Show | Transformers Prime | United States | TV-Y7-FV | 0 | Kids' TV | 2018.0 | 2013 | 1 |
| 3 | 80058654 | TV Show | Transformers: Robots in Disguise | United States | TV-Y7 | 0 | Kids' TV | 2018.0 | 2016 | 1 |
| 4 | 80125979 | Movie | #realityhigh | United States | TV-14 | 99 | Comedies | 2017.0 | 2017 | 0 |
| 5 | 80163890 | TV Show | Apaches | Spain | TV-MA | 0 | Crime TV Shows, International TV Shows, Spanis... | 2017.0 | 2016 | 1 |
#Graph showing the distribution of Movies and Tv Shows in each country of top 10, then the comparison
plt.figure(figsize=(15,6))
sns.countplot(x='country',hue='type',data=top_10, order=top10_country.index, palette="rocket")
plt.title('Comparison between 2 Types of Top 10 Countries',fontweight='bold')
plt.show()
#Create 2 new DataFrames to separate TV Shows and Movies
nf_tv = nf[nf['type']=='TV Show']
nf_movie = nf[nf['type']=='Movie']
# nf_movie.head()
#Overall the occurence of each year that the contents were added
nf['year_added'].value_counts()
2019.0 2349 2018.0 1781 2017.0 1297 2016.0 453 2020.0 184 2015.0 88 2014.0 25 2011.0 13 2013.0 12 2012.0 7 2009.0 2 2008.0 2 2010.0 1 Name: year_added, dtype: int64
# Titles added per year as a tidy frame: year_added | count | percent.
# rename_axis() + reset_index(name=...) name both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that
# labelling differs between pandas 1.x and 2.x).
nf_content = (
    nf['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
# Percentage of all titles added in each year; the total is computed once
# instead of being re-summed for every row.
nf_content['percent'] = 100 * nf_content['count'] / nf_content['count'].sum()
nf_content
| year_added | count | percent | |
|---|---|---|---|
| 11 | 2008.0 | 2 | 0.032185 |
| 10 | 2009.0 | 2 | 0.032185 |
| 12 | 2010.0 | 1 | 0.016093 |
| 7 | 2011.0 | 13 | 0.209205 |
| 9 | 2012.0 | 7 | 0.112649 |
| 8 | 2013.0 | 12 | 0.193112 |
| 6 | 2014.0 | 25 | 0.402317 |
| 5 | 2015.0 | 88 | 1.416157 |
| 3 | 2016.0 | 453 | 7.289990 |
| 2 | 2017.0 | 1297 | 20.872224 |
| 1 | 2018.0 | 1781 | 28.661088 |
| 0 | 2019.0 | 2349 | 37.801738 |
| 4 | 2020.0 | 184 | 2.961056 |
# Movies added per year: year_added | count | percent.
# Columns are named explicitly so the frame has the same layout on pandas 1.x and 2.x.
movie = (
    nf_movie['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
# Percentage of all movies added in each year (total computed once)
movie['percent'] = 100 * movie['count'] / movie['count'].sum()
movie
| year_added | count | percent | |
|---|---|---|---|
| 12 | 2008.0 | 1 | 0.023491 |
| 10 | 2009.0 | 2 | 0.046981 |
| 11 | 2010.0 | 1 | 0.023491 |
| 7 | 2011.0 | 13 | 0.305379 |
| 9 | 2012.0 | 4 | 0.093963 |
| 8 | 2013.0 | 6 | 0.140944 |
| 6 | 2014.0 | 19 | 0.446324 |
| 5 | 2015.0 | 56 | 1.315480 |
| 3 | 2016.0 | 262 | 6.154569 |
| 2 | 2017.0 | 910 | 21.376556 |
| 1 | 2018.0 | 1290 | 30.303030 |
| 0 | 2019.0 | 1546 | 36.316655 |
| 4 | 2020.0 | 147 | 3.453136 |
# TV shows added per year: year_added | count | percent.
# Columns are named explicitly so the frame has the same layout on pandas 1.x and 2.x.
tv = (
    nf_tv['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
# Percentage of all TV shows added in each year (total computed once)
tv['percent'] = 100 * tv['count'] / tv['count'].sum()
tv
| year_added | count | percent | |
|---|---|---|---|
| 9 | 2008.0 | 1 | 0.051099 |
| 8 | 2012.0 | 3 | 0.153296 |
| 7 | 2013.0 | 6 | 0.306592 |
| 6 | 2014.0 | 6 | 0.306592 |
| 5 | 2015.0 | 32 | 1.635156 |
| 3 | 2016.0 | 191 | 9.759836 |
| 2 | 2017.0 | 387 | 19.775166 |
| 1 | 2018.0 | 491 | 25.089423 |
| 0 | 2019.0 | 803 | 41.032192 |
| 4 | 2020.0 | 37 | 1.890649 |
# Interactive line chart: titles added per year for movies, TV shows and the whole catalogue
import plotly.graph_objects as go

# (source frame, legend label, marker colour) for each curve, in drawing order
_curve_specs = [
    (movie, "Movies", "#a678de"),
    (tv, "TV Shows", "#6ad49b"),
    (nf_content, "Total Contents", "brown"),
]
traces = [
    go.Scatter(x=frame['year_added'], y=frame["count"], name=label, marker={"color": colour})
    for frame, label, colour in _curve_specs
]
fig = go.Figure(
    data=traces,
    layout=go.Layout(
        title="<b>Content Added over Years",
        title_x=0.5,
        font={"family": "Arial", "size": 10, "color": 'black'},
        legend={"x": 0.1, "y": 1.1, "orientation": "h"},
    ),
)
fig.show()
plt.figure(figsize=(15,4))
sns.countplot(x='year_release', hue='type', data=nf[nf['year_release']>2000])
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=90)
plt.title('Contents Released over Years',fontweight='bold')
plt.show()
# Translate each Netflix rating code into a broad audience group.
# The ratings are listed per group and then inverted into the
# rating -> group lookup that replace() needs.
_ratings_by_group = {
    'Kids': ['TV-PG', 'TV-Y7-FV', 'TV-Y7', 'TV-Y', 'TV-G', 'PG', 'G'],
    'Teens': ['TV-14', 'PG-13'],
    'Adults': ['TV-MA', 'R', 'NR', 'UR', 'NC-17'],
}
ages = {
    rating: group
    for group, ratings in _ratings_by_group.items()
    for rating in ratings
}
# New 'ages' column: the rating code replaced by its audience group
nf['ages'] = nf['rating'].replace(ages)
nf['ages']
0 Kids
1 Adults
2 Kids
3 Kids
4 Teens
...
6218 Kids
6219 Kids
6220 Kids
6221 Teens
6222 Adults
Name: ages, Length: 6214, dtype: object
# Number of titles (show_id count) for every rating / audience-group pair,
# with the count column named 'counts' and rows ordered by audience group
rating_nf = (
    nf.groupby(['rating', 'ages'])['show_id']
    .count()
    .reset_index(name='counts')
    .sort_values('ages')
)
rating_nf
| rating | ages | counts | |
|---|---|---|---|
| 1 | NC-17 | Adults | 2 |
| 2 | NR | Adults | 217 |
| 5 | R | Adults | 508 |
| 8 | TV-MA | Adults | 2025 |
| 13 | UR | Adults | 7 |
| 0 | G | Kids | 37 |
| 3 | PG | Kids | 184 |
| 7 | TV-G | Kids | 149 |
| 9 | TV-PG | Kids | 699 |
| 10 | TV-Y | Kids | 142 |
| 11 | TV-Y7 | Kids | 168 |
| 12 | TV-Y7-FV | Kids | 95 |
| 4 | PG-13 | Teens | 286 |
| 6 | TV-14 | Teens | 1695 |
# Netflix ratings in x-axis order, grouped as Kids, Teens, then Adults ratings
group_ages = ["G", "TV-G", "TV-Y", "PG", "TV-PG", "TV-Y7", "TV-Y7-FV",
              "PG-13", "TV-14", 'NC-17', "NR", "R", "TV-MA", "UR"]
# Bar chart of titles per rating, coloured by audience group
plt.figure(figsize=(12,4))
# plt.grid() takes a visibility flag; 'whitegrid' is a seaborn style name and
# was merely acting as a truthy value here, so pass True explicitly.
plt.grid(True)
sns.barplot(x='rating', y='counts', data=rating_nf, hue='ages', order=group_ages, palette="viridis", dodge=False)
plt.title("Distribution of Ratings", fontweight='bold')
# Render explicitly, like the other figures in this file
plt.show()
Text(0.5, 1.0, 'Distribution of Ratings')
# Recreate the two per-type DataFrames now that nf has the 'ages' column.
# .copy() makes them independent of nf, so adding a column to them later
# (count() writes a 'genre' column) no longer triggers SettingWithCopyWarning.
nf_tv = nf[nf['type']=='TV Show'].copy()
nf_movie = nf[nf['type']=='Movie'].copy()
nf_movie.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81145628 | Movie | Norm of the North: King Sized Adventure | United States, India, South Korea, China | TV-PG | 90 | Children & Family Movies, Comedies | 2019.0 | 2019 | 0 | Kids |
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | United Kingdom | TV-MA | 94 | Stand-Up Comedy | 2016.0 | 2016 | 0 | Adults |
| 4 | 80125979 | Movie | #realityhigh | United States | TV-14 | 99 | Comedies | 2017.0 | 2017 | 0 | Teens |
| 6 | 70304989 | Movie | Automata | Bulgaria, United States, Spain, Canada | R | 110 | International Movies, Sci-Fi & Fantasy, Thrillers | 2017.0 | 2014 | 0 | Adults |
| 7 | 80164077 | Movie | Fabrizio Copano: Solo pienso en mi | Chile | TV-MA | 60 | Stand-Up Comedy | 2017.0 | 2017 | 0 | Adults |
# Movies per audience group: one row per 'ages' value with its title count
movie_rating = (
    nf_movie.groupby('ages')['show_id']
    .count()
    .reset_index(name='count')
    .sort_values('ages')
)
movie_rating
| ages | count | |
|---|---|---|
| 0 | Adults | 2065 |
| 1 | Kids | 868 |
| 2 | Teens | 1324 |
# TV shows per audience group: one row per 'ages' value with its title count
tv_rating = (
    nf_tv.groupby('ages')['show_id']
    .count()
    .reset_index(name='count')
    .sort_values('ages')
)
tv_rating
| ages | count | |
|---|---|---|
| 0 | Adults | 694 |
| 1 | Kids | 606 |
| 2 | Teens | 657 |
# Side-by-side pies: audience-group split for movies (left) and TV shows (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,8))
_pie_specs = [
    (movie_rating, 'Distribution of Movies Rating'),
    (tv_rating, 'Distribution of TV Shows Rating'),
]
for pie_ax, (frame, caption) in zip(axes, _pie_specs):
    pie_ax.pie(x=frame['count'], labels=frame['ages'], autopct='%1.2f%%')
    # y=-0.01 places the caption just below the pie instead of above it
    pie_ax.set_title(caption, fontweight="bold", y=-0.01)
Text(0.5, -0.01, 'Distribution of TV Shows Rating')
#DF of duration in min
duration_movie = nf[nf['duration'] != 0]
duration_movie.head(2)
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81145628 | Movie | Norm of the North: King Sized Adventure | United States, India, South Korea, China | TV-PG | 90 | Children & Family Movies, Comedies | 2019.0 | 2019 | 0 | Kids |
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | United Kingdom | TV-MA | 94 | Stand-Up Comedy | 2016.0 | 2016 | 0 | Adults |
duration_movie[duration_movie["Season"] == 1].info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 0 entries Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 0 non-null int64 1 type 0 non-null object 2 title 0 non-null object 3 country 0 non-null object 4 rating 0 non-null object 5 duration 0 non-null int64 6 listed_in 0 non-null object 7 year_added 0 non-null float64 8 year_release 0 non-null int64 9 Season 0 non-null int64 10 ages 0 non-null object dtypes: float64(1), int64(4), object(6) memory usage: 0.0+ bytes
# Histogram of movie running time in minutes, coloured by audience group
plt.figure(figsize=(12,4))
# plt.grid() takes a visibility flag; 'whitegrid' is a seaborn style name and
# was merely acting as a truthy value here, so pass True explicitly.
plt.grid(True)
sns.histplot(x='duration', data=duration_movie, bins=30, hue='ages', palette="viridis")
plt.title('Duration of Movies',fontweight="bold")
plt.show()
duration_tv = nf[nf['Season'] != 0]
duration_tv.head(2)
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 70234439 | TV Show | Transformers Prime | United States | TV-Y7-FV | 0 | Kids' TV | 2018.0 | 2013 | 1 | Kids |
| 3 | 80058654 | TV Show | Transformers: Robots in Disguise | United States | TV-Y7 | 0 | Kids' TV | 2018.0 | 2016 | 1 | Kids |
# Count of TV shows by number of seasons, coloured by audience group
plt.figure(figsize=(12,4))
# plt.grid() takes a visibility flag; 'whitegrid' is a seaborn style name and
# was merely acting as a truthy value here, so pass True explicitly.
plt.grid(True)
sns.countplot(x='Season', data=duration_tv, hue='ages', palette="viridis")
plt.title('Duration of TV Shows',fontweight="bold")
# Put the legend outside the axes so it does not cover the bars
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
#Import mlb from scikit-learn
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
#Function to find number of different kinds("listed_in") of each content(movies/TV Shows)
def count(nf, content):
    """Count the distinct genres in ``nf`` and describe the result.

    Side effect: stores the parsed genre lists in a new ``nf['genre']``
    column, which ``relation_heatmap`` reads afterwards.

    Args:
        nf: DataFrame with a comma-separated ``listed_in`` text column.
        content: Label for the kind of content (e.g. ``'Movie'``); used only
            in the returned message.

    Returns:
        A sentence stating how many distinct genres were found.
    """
    # Split on commas and strip each piece, so any spacing around the commas
    # is tolerated; empty pieces (e.g. from a trailing comma) are ignored.
    nf['genre'] = nf['listed_in'].apply(
        lambda text: [part.strip() for part in text.split(',') if part.strip()]
    )
    # Every row holds a list, so flatten all rows into one set of unique genres
    kinds = {genre for genres in nf['genre'] for genre in genres}
    return "There are {} types in the Netflix {} Dataset".format(len(kinds), content)
#Function for relation heatmap showing the relationship between different kinds
def relation_heatmap(nf, content):
    """Plot a lower-triangle heatmap of pairwise genre correlations.

    Reads ``nf['genre']``, which must hold a list of genre names per row
    (``count`` creates that column), and uses the module-level ``mlb``
    binarizer to turn those lists into one indicator column per genre.

    Args:
        nf: DataFrame with a ``genre`` column of lists.
        content: Label for the kind of content; used as the title prefix.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # One 0/1 column per genre, one row per title
    indicators = mlb.fit_transform(nf['genre'])
    genre_names = mlb.classes_
    df = pd.DataFrame(indicators, columns=genre_names, index=nf['genre'].index)
    corr = df.corr()
    # Hide the upper triangle and the diagonal, since the matrix is symmetric.
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(corr, mask=mask, cmap="viridis", vmin=-.5, vmax=.5,
                square=True, linewidths=.7, ax=ax)
    ax.set_title(content + " Genre", fontweight='bold')
    plt.show()
count(nf_movie, 'Movie')
<ipython-input-58-177eb72f2b81>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'There are 20 types in the Netflix Movie Dataset'
relation_heatmap(nf_movie, 'Movie')
count(nf_tv, 'TV Show')
<ipython-input-58-177eb72f2b81>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'There are 22 types in the Netflix TV Show Dataset'
relation_heatmap(nf_tv, 'TV Show')
# Two bar charts sharing a y-axis: the 10 most frequent 'listed_in' values
# for movies (left) and for TV shows (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
_panels = [
    (axes[0], nf_movie, "Top 10 Genre in Movies"),
    (axes[1], nf_tv, "Top 10 Genre in TV Shows"),
]
for panel_ax, frame, heading in _panels:
    top_listed = frame["listed_in"].value_counts().head(10)
    sns.barplot(ax=panel_ax, x=top_listed.index, y=top_listed.values, palette="RdGy")
    panel_ax.set_title(heading, fontweight='bold')
# Rotate the x-axis labels on every panel so the long genre names stay readable
for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=80)
plt.show()